This R challenge is about the history of Olympic Games. The following four data sets are needed for this project:
Load all packages that you need here.
library(tidyverse)
library(lubridate)
library(stringr)Read in the results data set. Then count the total number of observations per country, and show the top 10 countries
results <- read_csv(file = "results.csv")
results%>%
group_by(country)%>%
summarise(count = n())%>%
arrange(desc(count))%>%
slice(1:10)Count the total number of observations for Germany in all Winter Games. Note: Germany appears in the data with 3 different names (and codes), due to the fact that Germany was split into West and East.
results %>%
filter(str_detect(country, "Germany") & (season == "Winter"))%>%
summarise(n = n())Read in the data set athletes.csv and join it with the results. Briefly explain which join type you are using and why.
athletes <- athletes <- read_delim("athletes.csv", delim = ";",
escape_double = FALSE, trim_ws = TRUE)
athletes_results <- inner_join(athletes, results, by = "athlete")
# Inner join was used because I only want results where we know which athlete participated in the gamesUsing the joined data, calculate the average height and the average weight of all male participants.
#separate the measurements attribute into heights and weights attributes
heights_weights <- athletes_results %>%
separate(measurements, into = c("heights", "weights"), sep ="/")%>%
mutate(heights = as.numeric(str_extract(heights, "\\d+")),
weights = as.numeric(str_extract(weights, "\\d+")))
heights_weights# To get the average height and average weight of male participants
heights_weights%>%
select(athlete, sex, heights, weights) %>%
filter(sex == "Male") %>%
distinct(athlete, .keep_all = TRUE) %>%
summarise(Avg_height = mean(heights, na.rm = TRUE),
Avg_weight = mean(weights, na.rm = TRUE))Identify the tallest athlete (regarding variable height) and show all available information for this athlete in the joined data set.
heights_weights%>%
filter(heights == max(heights, na.rm = TRUE))Read in the data set games.tsv and join it with your existing data set. Briefly explain: For which Olympic Games do you have metadata, but no results? What is the reason for the missing results?
games <- read_tsv("games.tsv")
athletes_results_games <- left_join(athletes_results, games,
by =c("season", "year"))
athletes_results_gamesanti_join(games, athletes_results, by =c("season","year"))# The reason for the missing result is because it has a matching value in gamesCreate a new variable that contains the athletes’ birthday, formatted as a date column. Then create another variable that holds the age of athletes in years at the opening of the Olympic Games.
#separate born variable into birthday and birth city variables
birthday <- athletes_results_games%>%
separate(born, c("birthday", "birth_city"), "in")%>%
mutate(birthday = dmy(birthday))
birthday#To get the age of the athlete at the opening of the olympic games
df_age <- birthday%>%
mutate (age = round(interval(birthday, games_opened) / dyears(1)))
df_ageCalculate the average age per sport of the female participants. Then print out a ranking of the 10 sports with the lowest average age
df_female_age <- df_age %>%
group_by(sport)%>%
filter(sex == "Female" & !is.na(age))%>%
summarize(avg_age = mean(age), n = n())%>%
arrange(avg_age)
df_female_age[1:10, ]Calculate the medal table of the Olympic Summer Games 2016 in Rio de Janeiro and display the top 10 countries (ordered by Gold, Silver, and then Bronze medals). Your final table should look like this: https://en.wikipedia.org/wiki/2016_Summer_Olympics_medal_table. Hint 1: In team sports such as Handball, many players receive a gold medal, but for the countries’ medal table it only counts as one gold medal. You can recognize team sports by the fact that the variable team is not missing.
summer_games_2016 <- athletes_results_games%>%
filter(year == 2016 & season == "Summer" )%>%
pivot_wider(names_from = medal, values_from = medal)
summer_games_2016_individual <- summer_games_2016 %>%
filter(is.na(team)) %>%
group_by(country)%>%
summarise(Gold = sum(!is.na(Gold)),
Silver = sum(!is.na(Silver)),
Bronze = sum(!is.na(Bronze)))
summer_games_2016_team <- summer_games_2016 %>%
filter(!is.na(team)) %>%
select(sport, discipline, country, Gold, Silver, Bronze)%>%
distinct(.keep_all = TRUE)%>%
group_by(country)%>%
summarise(Gold = sum(!is.na(Gold)),
Silver = sum(!is.na(Silver)),
Bronze = sum(!is.na(Bronze)))
medal_table_summer_2016 <- summer_games_2016_individual%>%
left_join(summer_games_2016_team, by ="country")
# replace all NA with 0 in order to calculate the sum of the medals
medal_table_summer_2016[is.na(medal_table_summer_2016)] = 0
medal_table_summer_2016 <- medal_table_summer_2016 %>%
mutate(Gold = Gold.x + Gold.y,
Silver = Silver.x + Silver.y,
Bronze = Bronze.x + Bronze.y)%>%
select(country, Gold, Silver, Bronze)%>%
mutate(Total = Gold + Silver + Bronze) %>%
arrange(desc(Gold),desc(Silver), desc(Bronze))
medal_table_summer_2016[1:10, ]Some of the athletes have started for multiple countries (e.g. due to migration or other reasons). Show all athletes that have started for at least 4 countries. Then choose one of these athletes and display the 4 countries for which he or she started.
athletes_results_games%>%
mutate(UID = paste(name, athlete, country)) %>%
distinct(UID, .keep_all = TRUE) %>%
group_by(name, athlete) %>%
summarise(name = first(name), countries = n()) %>%
arrange(countries) %>%
filter(countries >= 4)#To display different countries for which the athlete has started
athletes_results_games%>%
filter(athlete == 59866)%>%
select(country)